// DESC: These subroutines optimize the hyperparameters of/for a Gaussian process; currently (7/25/2015) these include the hyperparameters of the covariance function + the noise in the data. On input and output, these hyperparameters are passed separately; internally though, they are combined into a single vector.


// STL
#include <memory>    // std::unique_ptr<>
//#include <stdexcept> // std::runtime_error()
//#include <tuple>     // std::tie()
#include <utility>   // std::pair<>, std::make_pair()
#include <vector>    // std::vector<>

// jScience
#include "jScience/linalg/Matrix.hpp" // Matrix<>
#include "jScience/linalg/Vector.hpp" // Vector<>
#include "jminimization.h"            // sim_anneal()

// stats++
#include "statsxx/optimization/gradient_based/RPROP.hpp" // RPROP
#include "statsxx/optimization/line_search_methods/ConjugateGradient.hpp" // ConjugateGradient
#include "statsxx/statistics/CovarianceFunction.hpp" // CovarianceFunction


static double fwrapV(std::unique_ptr<CovarianceFunction> const &k, const Matrix<double> &X, const Vector<double> &y, const Matrix<double> &X_val, const Vector<double> &y_val, const Vector<double> &x, const int n);
static Vector<double> dfwrapV(std::unique_ptr<CovarianceFunction> const &k, const Matrix<double> &X, const Vector<double> &y, const Vector<double> &x, const int n);

static double fwrapVv(std::unique_ptr<CovarianceFunction> const &k, const Matrix<double> &X, const Vector<double> &y, const Matrix<double> &X_val, const Vector<double> &y_val, const std::vector<double> &x, const int n);

static double fwrapVvV(std::unique_ptr<CovarianceFunction> const &k, const Matrix<double> &X, const Vector<double> &y, const Matrix<double> &X_val, const Vector<double> &y_val, const std::vector<double> &x);

// wrapper functions for f(x) and f'(x)
static double f(std::unique_ptr<CovarianceFunction> const &k, const Matrix<double> &X, const Vector<double> &y, const Matrix<double> &X_val, const Vector<double> &y_val, const Vector<double> &theta);
static Vector<double> df(std::unique_ptr<CovarianceFunction> const &k, const Matrix<double> &X, const Vector<double> &y, const Vector<double> &theta);

static std::pair<std::vector<double>,
                 double> convert(const Vector<double> &x);


//
// DESC: (de-)Normalizes data to lie in the range [0,1].
//
inline std::pair<
                 std::vector<double>,
                 double
                 > Gaussian_process::optimize_hyperparameters(std::unique_ptr<CovarianceFunction> const &k, std::vector<double> theta, const Matrix<double> &X, const Vector<double> &y, const Matrix<double> &X_val, const Vector<double> &y_val, double sigma_n)
{
    std::cout << "here 1y" << std::endl;

    // setup wrapper functions
    std::function<double(const Vector<double> &)> fwrap = std::bind(f, std::cref(k), std::cref(X), std::cref(y), std::cref(X_val), std::cref(y_val), std::placeholders::_1);
    std::function<Vector<double>(const Vector<double> &)> dfwrap = std::bind(df, std::cref(k), std::cref(X), std::cref(y), std::placeholders::_1);

    // create a single hyperparameters vector (covariance function + Gaussian process noise)
    theta.push_back(sigma_n);
    Vector<double> x(theta);

    // minimize
    int info;
    double fx;
//    ConjugateGradient cg;
//    std::tie(info, x, fx) = cg.minimize(fwrap, dfwrap, x);
//    RPROP rp;
//    rp.iter_max = 20;
//    std::tie(info, x, fx) = rp.minimize(fwrap, dfwrap, x);
    // ******

    // ******
    // ******
    // ******

    // THE FOLLOWING ASSIGNS A *SINGLE* OPTIMIZABLE LENGTH SCALE TO ALL DIMENSIONS OF DATA

    int n = x.size();

//    std::function<double(const Vector<double> &)> fwrap3 = std::bind(fwrapV, std::cref(k), std::cref(X), std::cref(y), std::placeholders::_1, std::cref(n));
//    std::function<Vector<double>(const Vector<double> &)> dfwrap3 = std::bind(dfwrapV, std::cref(k), std::cref(X), std::cref(y), std::placeholders::_1, std::cref(n));

    Vector<double> tmpx(3);
    tmpx(0) = x(0);
    tmpx(1) = x((n-2));
    tmpx(2) = x((n-1));

    std::cout << "here 2y" << std::endl;

//    ConjugateGradient cg;
//    std::tie(info, tmpx, fx) = cg.minimize(fwrap3, dfwrap3, tmpx);
//    RPROP rp;
//    rp.iter_max = 20;
//    std::tie(info, tmpx, fx) = rp.minimize(fwrap3, dfwrap3, tmpx);


    // simulated annealing:

    std::function<double(const std::vector<double> &)> fwrap4 = std::bind(fwrapVv, std::cref(k), std::cref(X), std::cref(y), std::cref(X_val), std::cref(y_val), std::placeholders::_1, std::cref(n));

    double eps = 0.01;

//    double T = 20.0;
    double T = -1.0; // determine optimum
    int N_T = 10;
    int N_S = 5;

    bool silent = false;

    int xtype[3];
    double a[3];
    double b[3];
    double v[3];

    xtype[0] = 1;
    xtype[1] = 1;
    xtype[2] = 1;

    a[0] = 1.e-8;
    b[0] = 1000.0;
    v[0] = 0.1;

    a[1] = 1.e-8;
    b[1] = 100.0;
    v[1] = 0.1;

    a[2] = 0.0;
    b[2] = 0.1;
    v[2] = 1.0e-6;

    std::cout << "here 3y" << std::endl;

    std::vector<double> xv = tmpx.std_vector();

    sim_anneal(
               xv,
               3,
               xtype,
               a,
               b,
               fwrap4,
               T,
               v,
               eps,
               N_T,
               N_S,
               silent
               );

    std::cout << "here 4y" << std::endl;

    x = Vector<double>(n, xv[0]);
    x((n-2)) = xv[1];
    x((n-1)) = xv[2];

    // ******
    // ******
    // ******
    // ******

    // THE FOLLOWING ASSIGNS AN OPTIMIZABLE PARAMETER FOR EACH DIMENSIONS OF DATA --- SO SLOW, USE THE ABOVE IF YOU CAN

/*
    // simulated annealing:

    std::function<double(const std::vector<double> &)> fwrap5 = std::bind(fwrapVvV, std::cref(k), std::cref(X), std::cref(y), std::cref(X_val), std::cref(y_val), std::placeholders::_1);

    double eps = 0.01;

    //    double T = 20.0;
    double T = -1.0; // determine optimum
    int N_T = 10;
    int N_S = 5;

    bool silent = false;

    int xtype[x.size()];
    double a[x.size()];
    double b[x.size()];
    double v[x.size()];

    for(auto i = 0; i < x.size(); ++i)
    {
        xtype[i] = 1;
        a[i] = 0.01;
        b[i] = 100.0;
        v[i] = 0.1;
    }

    a[x.size()-2] = 0.1;
    b[x.size()-2] = 10.0;

    a[x.size()-1] = 0.0;
    b[x.size()-1] = 0.1;
    v[x.size()-1] = 1.0e-6;

    std::vector<double> xv = x.std_vector();

    sim_anneal(
               xv,
               x.size(),
               xtype,
               a,
               b,
               fwrap5,
               T,
               v,
               eps,
               N_T,
               N_S,
               silent
               );

    x = Vector<double>(xv);
*/
    // ******
    // ******
    // ******
    // ******

    // convert and return the components of the (single) hyperparameters vector
    return convert(x);
}

// *****


static double fwrapVvV(std::unique_ptr<CovarianceFunction> const &k, const Matrix<double> &X, const Vector<double> &y, const Matrix<double> &X_val, const Vector<double> &y_val, const std::vector<double> &x)
{
    return f(k, X, y, X_val, y_val, Vector<double>(x));
}

static double fwrapVv(std::unique_ptr<CovarianceFunction> const &k, const Matrix<double> &X, const Vector<double> &y, const Matrix<double> &X_val, const Vector<double> &y_val, const std::vector<double> &x, const int n)
{
    return fwrapV(k, X, y, X_val, y_val, Vector<double>(x), n);
}


static double fwrapV(std::unique_ptr<CovarianceFunction> const &k, const Matrix<double> &X, const Vector<double> &y, const Matrix<double> &X_val, const Vector<double> &y_val, const Vector<double> &x, const int n)
{
    Vector<double> theta(n, x(0));
    theta((n-2)) = x(1);
    theta((n-1)) = x(2);

    return f(k, X, y, X_val, y_val, theta);
}

static Vector<double> dfwrapV(std::unique_ptr<CovarianceFunction> const &k, const Matrix<double> &X, const Vector<double> &y, const Vector<double> &x, const int n)
{
    Vector<double> theta(n, x(0));
    theta((n-2)) = x(1);
    theta((n-1)) = x(2);

    Vector<double> vdf = df(k, X, y, theta);

    Vector<double> ret(3);
    ret(0) = theta(0);
    ret(1) = theta((n-2));
    ret(2) = theta((n-1));

    return ret;
}


// *****

static double f(std::unique_ptr<CovarianceFunction> const &k, const Matrix<double> &X, const Vector<double> &y, const Matrix<double> &X_val, const Vector<double> &y_val, const Vector<double> &x)
{
    std::vector<double> theta;
    double sigma_n;
    std::tie(theta, sigma_n) = convert(x);

    std::unique_ptr<CovarianceFunction> knew = k->clone();
    knew->assign_hyperparameters(theta);

    Gaussian_process::GaussianProcess gp(knew);

    gp.add_data(X, y, sigma_n);

//    return -gp.likelihood();

    // ***

    std::ofstream ofs("GP_train.dat", std::ios::app);

    double L = gp.likelihood();

    double MSE = 0.0;

    for( auto i = 0; i < X.size(0); ++i )
    {
        double pred = gp.predict(X.row(i));

        MSE += ( (pred - y(i))*(pred - y(i)) );
    }

    MSE /= X.size(0);

    // compute the validation error too
    double MSE_val = 0.0;

    for( auto i = 0; i < X_val.size(0); ++i )
    {
        double pred = gp.predict(X_val.row(i));

        MSE_val += ( (pred - y_val(i))*(pred - y_val(i)) );
    }

    MSE_val /= X_val.size(0);

    ofs << L << "   " << MSE << "   " << MSE_val << '\n';

    ofs.close();

    return -L;
}


static Vector<double> df(std::unique_ptr<CovarianceFunction> const &k, const Matrix<double> &X, const Vector<double> &y, const Vector<double> &x)
{
    std::vector<double> theta;
    double sigma_n;
    std::tie(theta, sigma_n) = convert(x);

    std::unique_ptr<CovarianceFunction> knew = k->clone();
    knew->assign_hyperparameters(theta);

    Gaussian_process::GaussianProcess gp(knew);

    gp.add_data(X, y, sigma_n);

    return -Vector<double>(gp.dlikelihood());
}


static std::pair<std::vector<double>,
                 double> convert(const Vector<double> &x)
{
    std::vector<double> theta = x.std_vector();
    double sigma_n = theta.back();
    theta.pop_back();

    return std::make_pair(theta, sigma_n);
}
